Load Data

data_dir='/Users/robert/Documents/UMN/air_pollution/'
DF <- read.csv(paste0(data_dir, 'texas.csv'), stringsAsFactors=FALSE)
DF$Date <- ymd(DF$Date_Local)
DF$Date_Local <- NULL
DF$Year <- year(DF$Date)
DF$Longitude <- as.numeric(DF$Longitude)
DF$Latitude <- as.numeric(DF$Latitude)

Describe the data:

describe(DF)
## DF 
## 
##  16  Variables      4119327  Observations
## ---------------------------------------------------------------------------
## State_Code 
##       n missing  unique    Info    Mean 
## 4119327       0       1       0      48 
## ---------------------------------------------------------------------------
## County_Code 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
## 4119327       0      29    0.99   204.3      29      29     113     201 
##     .75     .90     .95 
##     303     439     453 
## 
## lowest :  29  43  61 113 121, highest: 375 439 453 479 485 
## ---------------------------------------------------------------------------
## Site_Num 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
## 4119327       0      54       1   387.8       3       5      24      78 
##     .75     .90     .95 
##     677    1042    1051 
## 
## lowest :    1    2    3    4    5, highest: 2004 3003 3008 3009 3011 
## ---------------------------------------------------------------------------
## Latitude 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
## 4119327       0      66       1    30.4   27.43   28.70   29.30   29.90 
##     .75     .90     .95 
##   31.84   32.76   33.59 
## 
## lowest : 25.89 26.07 26.23 27.43 27.60
## highest: 33.59 33.59 33.86 35.20 35.21 
## ---------------------------------------------------------------------------
## Longitude 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
## 4119327       0      66       1  -98.36 -106.40 -103.18 -100.45  -97.43 
##     .75     .90     .95 
##  -95.33  -94.86  -94.09 
## 
## lowest : -106.50 -106.50 -106.46 -106.40 -106.29
## highest:  -94.17  -94.09  -94.08  -93.91  -93.87 
## ---------------------------------------------------------------------------
## Time_Local 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
## 4119327       0      24       1    11.5       1       2       5      11 
##     .75     .90     .95 
##      18      21      22 
## 
## lowest :  0  1  2  3  4, highest: 19 20 21 22 23 
## ---------------------------------------------------------------------------
## pm25 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
## 4119327       0    1671       1   9.514     2.1     3.0     5.1     8.1 
##     .75     .90     .95 
##    12.2    17.4    21.5 
## 
## lowest :  -2.1  -1.9  -1.8  -1.7  -1.4
## highest: 617.1 719.4 755.6 757.0 821.9 
## ---------------------------------------------------------------------------
## pm10 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##  316588 3802739     769       1   28.27       7       9      14      21 
##     .75     .90     .95 
##      31      48      67 
## 
## lowest :   -4   -3   -2   -1    0, highest: 4186 4419 4526 4686 4739 
## ---------------------------------------------------------------------------
## wind_Knots 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
## 3356199  763128     402       1   5.838     1.1     1.7     3.1     5.2 
##     .75     .90     .95 
##     7.8    10.7    12.7 
## 
## lowest :  0.00  0.05  0.10  0.20  0.30
## highest: 42.60 43.00 43.10 43.60 44.00 
## ---------------------------------------------------------------------------
## wind_Degrees_Compass 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
## 3356199  763128    3602       1     170    20.0    42.0   111.3   160.1 
##     .75     .90     .95 
##   222.9   319.0   340.0 
## 
## lowest :   0.00   0.05   0.10   0.20   0.30
## highest: 359.60 359.70 359.80 359.90 360.00 
## ---------------------------------------------------------------------------
## temperature 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
## 3377228  742099     113       1   68.93      40      46      58      71 
##     .75     .90     .95 
##      81      88      92 
## 
## lowest :   1   2   3   4   5, highest: 109 110 111 112 114 
## ---------------------------------------------------------------------------
## pressure 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##  265992 3853335     109       1   998.8     889     893    1010    1015 
##     .75     .90     .95 
##    1020    1025    1028 
## 
## lowest :  871  872  873  874  875, highest: 1044 1045 1046 1047 1048 
## ---------------------------------------------------------------------------
## RH_Dewpoint_Percent_relative_humidity 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##  971442 3147885     100       1   60.29      18      26      43      64 
##     .75     .90     .95 
##      80      89      92 
## 
## lowest :   1   2   3   4   5, highest:  96  97  98  99 100 
## ---------------------------------------------------------------------------
## RH_Dewpoint_Degrees_Fahrenheit 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##  971442 3147885      86       1   51.49      18      24      37      56 
##     .75     .90     .95 
##      67      72      74 
## 
## lowest :  0.00  0.05  1.00  2.00  3.00
## highest: 80.00 81.00 82.00 84.00 94.00 
## ---------------------------------------------------------------------------
## Date 
##          n    missing     unique       Info       Mean        .05 
##    4119327          0       3591          1 2010-02-22 2005-07-29 
##        .10        .25        .50        .75        .90        .95 
## 2006-03-10 2007-11-20 2010-03-22 2012-07-16 2013-11-17 2014-04-25 
## 
## lowest : 2005-01-01 2005-01-02 2005-01-03 2005-01-04 2005-01-05
## highest: 2014-10-27 2014-10-28 2014-10-29 2014-10-30 2014-10-31 
## ---------------------------------------------------------------------------
## Year 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
## 4119327       0      10    0.99    2010    2005    2006    2007    2010 
##     .75     .90     .95 
##    2012    2013    2014 
## 
##             2005   2006   2007   2008   2009   2010   2011   2012   2013
## Frequency 347211 345748 380542 435660 448561 455735 437691 442011 472342
## %              8      8      9     11     11     11     11     11     11
##             2014
## Frequency 353826
## %              9
## ---------------------------------------------------------------------------

Remove locations with missing data

by_site <- DF %>% group_by(County_Code, Site_Num, Year) %>%
    summarise(avg_pm = mean(pm25),
        pm10_missing = sum(is.na(pm10))/length(Site_Num),
        pressure_missing = sum(is.na(pressure))/length(Site_Num),
        humidity_missing = sum(is.na(RH_Dewpoint_Percent_relative_humidity))/length(Site_Num),
        dewpoint_degreesf_missing = sum(is.na(RH_Dewpoint_Degrees_Fahrenheit))/length(Site_Num),
        wind_speed_missing = sum(is.na(wind_Knots))/length(Site_Num),
        wind_direction_missing = sum(is.na(wind_Degrees_Compass)) / length(Site_Num),
        temp_missing = sum(is.na(temperature)) / length(Site_Num),
        count = length(County_Code))

# only look at sites with less than 10% missing in some year
non_missing = unique(by_site[by_site$wind_speed_missing < .1 &
        by_site$wind_direction_missing < .1 &
        by_site$temp_missing < .1, c("County_Code", "Site_Num")])

DF <- inner_join(DF, non_missing, by = c("County_Code", "Site_Num"))

map locations:

library(ggmap)
## Warning: package 'ggmap' was built under R version 3.1.3
map <- get_map(location = 'texas', zoom=6, color="bw")
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=texas&zoom=6&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=texas&sensor=false
p <- ggmap(map)
agg_df <- DF %>%
    group_by(Latitude, Longitude) %>%
    summarise(ct = length(pm25))
p + geom_point(data=agg_df, aes(x=Longitude, y=Latitude), color="red")

cluster locations

n_clusters <- 6
X <- as.matrix(agg_df[,1:2])

#  K-Means clustering, scale the data matrix.
clusters <- kmeans(scale(X), n_clusters, iter.max = 25)

#  Add cluster label to the data frame.
agg_df$cluster <- factor(clusters$cluster)

#  Examine the cluster averages (use unlogged, unscaled variable values).
centers <- agg_df %>% group_by(cluster) %>%
    summarise(Latitude = mean(Latitude),
        Longitude = mean(Longitude),
        num_unique = length(ct),
        num = sum(ct))
centers <- centers[order(centers$num_unique, decreasing=TRUE),]
print(xtable(centers), type='html')
cluster Latitude Longitude num_unique num
1 4 29.77 -94.93 14 835815
2 6 32.53 -96.67 13 683680
3 5 29.56 -99.25 12 1042468
4 3 31.64 -105.11 8 445118
5 2 26.87 -97.52 6 332918
6 1 33.68 -100.76 3 124694

plot clusters

p + geom_point(data=agg_df, aes(x=Longitude, y=Latitude, colour=cluster), size=3) +
    theme(legend.position = "none")

sizable_clusters <- centers[centers$num_unique > 4,]
plot_df <- agg_df[agg_df$cluster %in% sizable_clusters$cluster,]
p + geom_hex(data=plot_df,
    aes(x=Longitude, y=Latitude, fill=cluster),
        colour = "white", alpha = .6) +
    geom_point(data = sizable_clusters,
        aes(x = Longitude,
            y = Latitude),
        colour = "black",
        alpha=.6,
        size = 16) +
    geom_point(data = sizable_clusters,
        aes(x = Longitude,
            y = Latitude,
            colour = cluster),
        alpha=.6,
        size = 15) +
    geom_text(data = sizable_clusters,
        aes(x = Longitude,
            y = Latitude,
            label = paste0("#", cluster, ": ", num_unique)),
        size=4,
        colour = "black") +
    labs(title = "Spatial Clusters") +
    theme(legend.position = "none")

zoom in on largest cluster:

biggest_cluster <- agg_df[agg_df$cluster %in% centers$cluster[1],]

area <- get_map(location = c(lon=mean(biggest_cluster$Longitude),
    lat=mean(biggest_cluster$Latitude)),
    zoom=9,
    color="bw")
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=29.770063,-94.930636&zoom=9&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
p <- ggmap(area)
p + geom_point(data=biggest_cluster, aes(x=Longitude, y=Latitude), color="red")

Below is a summary of the data in this location

area_df = DF[DF$Latitude >= min(biggest_cluster$Latitude) &
        DF$Latitude <= max(biggest_cluster$Latitude) &
        DF$Longitude >= min(biggest_cluster$Longitude) &
        DF$Longitude <= max(biggest_cluster$Longitude),]
area_df$County_Code <- factor(area_df$County_Code)
area_df$Site_Num <- factor(area_df$Site_Num)
area_df$Year <- factor(area_df$Year)
area_df$State_Code <- NULL
area_df$pm10 <- NULL
describe(area_df)
## area_df 
## 
##  14  Variables      835815  Observations
## ---------------------------------------------------------------------------
## County_Code 
##       n missing  unique 
##  835815       0       4 
## 
## 167 (77007, 9%), 201 (546210, 65%), 245 (137177, 16%) 
## 339 (75421, 9%) 
## ---------------------------------------------------------------------------
## Site_Num 
##       n missing  unique 
##  835815       0      12 
## 
##              14    20    22    24    26   58    78   416   1034  1035
## Frequency 18811 28563 79253 79738 68641 9423 75421 63420 138923 81740
## %             2     3     9    10     8    1     9     8     17    10
##            1039   1050
## Frequency 81770 110112
## %            10     13
## ---------------------------------------------------------------------------
## Latitude 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##  835815       0      14    0.99   29.78   29.25   29.58   29.67   29.77 
##     .75     .90     .95 
##   29.90   30.07   30.35 
## 
##           29.254474 29.263319 29.583047 29.670025 29.686389 29.733726
## Frequency     58196     18811     80751     81770     63420     81740
## %                 7         2        10        10         8        10
##           29.767971 29.770698 29.802707 29.863953 29.901037 30.06607
## Frequency     80727      9423     68641     79253     79738    28563
## %                10         1         8         9        10        3
##           30.06717 30.350302
## Frequency    29361     75421
## %                4         9
## ---------------------------------------------------------------------------
## Longitude 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##  835815       0      14    0.99  -95.02  -95.43  -95.33  -95.29  -95.13 
##     .75     .90     .95 
##  -94.86  -94.32  -94.09 
## 
##           -95.425128 -95.326125 -95.294722 -95.257593 -95.220587
## Frequency      75421      79738      63420      81740      80727
## %                  9         10          8         10         10
##           -95.128508 -95.125495 -95.031232 -95.015544 -94.861289
## Frequency      81770      68641       9423      80751      58196
## %                 10          8          1         10          7
##           -94.856568 -94.3178 -94.09093 -94.077383
## Frequency      18811    79253     29361      28563
## %                  2        9         4          3
## ---------------------------------------------------------------------------
## Time_Local 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##  835815       0      24       1   11.51       1       2       5      12 
##     .75     .90     .95 
##      18      21      22 
## 
## lowest :  0  1  2  3  4, highest: 19 20 21 22 23 
## ---------------------------------------------------------------------------
## pm25 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##  835815       0     928       1   11.06     3.0     4.2     6.5     9.7 
##     .75     .90     .95 
##    14.1    19.6    23.7 
## 
## lowest :  -1.8  -1.4  -1.2  -1.1  -1.0
## highest: 281.8 282.0 310.0 358.2 755.6 
## ---------------------------------------------------------------------------
## wind_Knots 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##  827008    8807     276       1   5.222     1.0     1.5     2.8     4.8 
##     .75     .90     .95 
##     7.1     9.5    11.0 
## 
## lowest :  0.00  0.05  0.10  0.20  0.30
## highest: 27.90 28.00 28.80 28.90 32.40 
## ---------------------------------------------------------------------------
## wind_Degrees_Compass 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##  827008    8807    3602       1   163.3    17.0    34.0   100.4   157.0 
##     .75     .90     .95 
##   212.0   321.4   342.5 
## 
## lowest :   0.00   0.05   0.10   0.20   0.30
## highest: 359.60 359.70 359.80 359.90 360.00 
## ---------------------------------------------------------------------------
## temperature 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##  829185    6630      92       1   70.25      44      50      61      73 
##     .75     .90     .95 
##      81      86      89 
## 
## lowest :  18  19  20  21  22, highest: 105 106 107 110 114 
## ---------------------------------------------------------------------------
## pressure 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##  221958  613857      56       1    1017    1008    1010    1013    1017 
##     .75     .90     .95 
##    1021    1026    1028 
## 
## lowest :  992  994  995  996  997, highest: 1044 1045 1046 1047 1048 
## ---------------------------------------------------------------------------
## RH_Dewpoint_Percent_relative_humidity 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##  436860  398955      97       1   69.53      34      43      58      73 
##     .75     .90     .95 
##      84      90      93 
## 
## lowest :   4   5   6   7   8, highest:  96  97  98  99 100 
## ---------------------------------------------------------------------------
## RH_Dewpoint_Degrees_Fahrenheit 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##  436860  398955      85       1   58.96      29      36      49      64 
##     .75     .90     .95 
##      71      74      75 
## 
## lowest :  0.00  0.05  1.00  2.00  3.00
## highest: 79.00 80.00 81.00 82.00 84.00 
## ---------------------------------------------------------------------------
## Date 
##          n    missing     unique       Info       Mean        .05 
##     835815          0       3554          1 2009-11-26 2005-07-02 
##        .10        .25        .50        .75        .90        .95 
## 2006-01-19 2007-06-13 2009-11-18 2012-05-21 2013-10-30 2014-04-13 
## 
## lowest : 2005-01-01 2005-01-02 2005-01-03 2005-01-04 2005-01-05
## highest: 2014-09-26 2014-09-27 2014-09-28 2014-09-29 2014-09-30 
## ---------------------------------------------------------------------------
## Year 
##       n missing  unique 
##  835815       0      10 
## 
##            2005  2006  2007  2008  2009  2010  2011  2012  2013  2014
## Frequency 79589 88143 92553 85834 81709 81517 83636 86026 89300 67508
## %            10    11    11    10    10    10    10    10    11     8
## ---------------------------------------------------------------------------

The area containing this cluster has Longitude = (-95.43, -94.08), and Latitude = (29.25, -)